Skip to contents

#WORK IN PROGRESS

Summary

This article focuses on the spatial representation of dissimilarity scores.

TODO

Setup

library(distantia, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(mapview, warn.conflicts = FALSE)

Example Data

The examples below illustrate how to map dissimilarity scores using the datasets covid_prevalence and covid_polygons included with distantia. The dataset comprises time series of weekly Covid-19 prevalence in several California counties.

tsl <- distantia::tsl_initialize(
  x = distantia::covid_prevalence,
  name_column = "name",
  time_column = "time"
)

distantia::tsl_plot(
  tsl = tsl[1:10],
  columns = 2,
  guide = FALSE
)

The map below displays the polygons in distantia::covid_counties using mapview().

mapview::mapview(
  distantia::covid_counties,
  label = "name"
)

Dissimilarity Analysis

In this section we prepare several datasets to use in the different mapping examples: a lock-step dissimilarity data frame (df_psi), dissimilarity stats per time series (df_stats), and a hierarchical clustering based on the dissimilarity scores (df_cluster).

The lock-step dissimilarity analysis includes a permutation test p-values, which will be useful as criteria to select relevant mapping features.

#parallelization setup
future::plan(
  future::multisession,
  workers = parallelly::availableCores() - 1
  )

#lock-step dissimilarity analysis
df_psi <- distantia::distantia(
  tsl = tsl,
  distance = "euclidean",
  lock_step = TRUE,
  repetitions = 1000,
  permutation = "restricted",
  block_size = 12 #weeks
)

#disable parallelization
future::plan(
  future::sequential
  )

#check resulting data frame
df_psi |> 
  dplyr::select(
    x,
    y,
    psi,
    p_value
  ) |> 
  dplyr::glimpse()
#> Rows: 630
#> Columns: 4
#> $ x       <chr> "Napa", "Alameda", "Alameda", "Sacramento", "San_Joaquin", "Sa…
#> $ y       <chr> "Solano", "San_Mateo", "Contra_Costa", "Sonoma", "Stanislaus",…
#> $ psi     <dbl> 0.8726115, 1.0656371, 1.1620553, 1.2578125, 1.2919255, 1.29793…
#> $ p_value <dbl> 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,…

The code below aggregates psi scores in the data frame df_psi across time series to summarize the overall dissimilarity of each time series with all others.

df_stats <- distantia::distantia_stats(
  df = df_psi
)
#> Loading required package: foreach
#> Loading required package: future

df_stats |> 
  dplyr::select(
    name, mean
  ) |> 
  dplyr::glimpse()
#> Rows: 36
#> Columns: 2
#> $ name <chr> "Alameda", "Butte", "Contra_Costa", "El_Dorado", "Fresno", "Humbo…
#> $ mean <dbl> 3.214531, 3.266749, 3.358524, 3.546757, 3.270684, 4.028824, 3.834…

Finally, we run a hierarchical clustering using the psi scores in df_psi as clustering criteria.

#hierarchical clustering of lock-step dissimilarity
df_cluster <- distantia::distantia_cluster_hclust(
  df = df_psi
)$df

dplyr::glimpse(df_cluster)
#> Rows: 36
#> Columns: 3
#> $ name             <chr> "Napa", "Alameda", "Sacramento", "San_Joaquin", "Sant…
#> $ cluster          <dbl> 1, 2, 2, 1, 3, 2, 2, 3, 2, 1, 1, 1, 2, 1, 2, 4, 2, 1,…
#> $ silhouette_width <dbl> 0.33267984, 0.25073529, 0.11016028, 0.42375200, 0.431…

Dissimilarity Networks

The function distantia_spatial_network() transforms the result of distantia() to an sf data frame with edges connecting time series locations. The result can be interpreted as a dissimilarity network.

sf_network <- distantia::distantia_spatial_network(
  df = df_psi,
  sf = distantia::covid_counties |> 
    dplyr::select(
      name, geometry
    )
)

dplyr::glimpse(sf_network)
#> Rows: 630
#> Columns: 9
#> $ edge_name <chr> "Napa - Solano", "Alameda - San_Mateo", "Alameda - Contra_Co…
#> $ y         <chr> "Solano", "San_Mateo", "Contra_Costa", "Sonoma", "Stanislaus…
#> $ x         <chr> "Napa", "Alameda", "Alameda", "Sacramento", "San_Joaquin", "…
#> $ psi       <dbl> 0.8726115, 1.0656371, 1.1620553, 1.2578125, 1.2919255, 1.297…
#> $ p_value   <dbl> 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.00…
#> $ null_mean <dbl> 2.610115, 2.532170, 2.406356, 2.742062, 2.882596, 2.857959, …
#> $ null_sd   <dbl> 0.2271616, 0.2759364, 0.2829994, 0.3067348, 0.2033813, 0.235…
#> $ geometry  <LINESTRING [°]> LINESTRING (-122.3298 38.50..., LINESTRING (-121.…
#> $ length    <dbl> 43101.68, 46400.19, 30278.10, 135022.90, 48050.18, 33550.83,…

The sf data frame has a field with the edge name, the columns x, y, and psi of the distantia data frame, a linestring geometry column, and the length column with the edge length.

This sf data frame can be mapped right away, but in this case there are too many pairs of counties to achieve a meaningful map

mapview::mapview(
  sf_network,
  layer.name = "Psi",
  label = "edge_name",
  zcol = "psi",
  color = distantia::utils_color_continuous_default()
)

The mess above can be solved by focusing on particular aspects of the data at hand. For example, below we subset sf_network to focus on San Francisco and its most similar counties in terms of Covid19 prevalence.

#subset edges involving San Francisco
#with a maximum length of 250km
sf_network_subset <- sf_network |> 
  dplyr::filter(
    grepl(
      pattern = "San_Francisco",
      x = edge_name
    ),
    p_value <= 0.05
  )

#map country polygons and dissimilarity edges
mapview::mapview(
  covid_counties,
  col.regions = NA,
  color = "black",
  label = "name",
  legend = FALSE,
  map.type = "OpenStreetMap"
) +
  mapview::mapview(
    sf_network_subset,
    layer.name = "San Francisco - Psi",
    label = "edge_name",
    zcol = "psi",
    lwd = (1/sf_network_subset$psi)*10,
    color = distantia::utils_color_continuous_default(n = nrow(sf_network_subset) - 1)
  )

Mapping Dissimilarity Stats

Mapping the dissimilarity stats of each time series may help identify places that are somewhat special because they show a high dissimilarity with all others, or places that are average and have no distinctive features.

Merging the dissimilarity stats with the sf data frame containing the county polygons generates the spatial data required for the map.

sf_stats <- merge(
  x = distantia::covid_counties,
  y = df_stats
)

The map below uses warm colors for places with a high dissimilarity with all others such as Humboldt county in the north west of the state, and cold colors for places with a higher dissimilarity with all others.

mapview::mapview(
  sf_stats |> 
    dplyr::select(
      mean,
      name
    ),
  layer.name = "Psi mean",
  zcol = "mean",
  label = "name",
  col.regions = distantia::utils_color_continuous_default(),
  alpha.regions = 1
)